bank <- data.frame(read.csv(file="bank.csv"))
# changing the data types
bank[] <- lapply(bank, as.factor)
bank$age <- as.numeric(bank$age)
bank$balance <- as.numeric(bank$balance)
bank$day <- as.numeric(bank$day)
bank$duration <- as.numeric(bank$duration)
bank$campaign <- as.numeric(bank$campaign)
bank$previous <- as.numeric(bank$previous)
bank$pdays <- as.numeric(bank$pdays)
# Checking for outliers
boxplot(bank$age,ylab = 'age')
boxplot(bank$balance,ylab = 'balance')
boxplot(bank$day, ylab = 'day')
boxplot(bank$duration, ylab = 'duration')
boxplot(bank$campaign, ylab = 'campaign')
hist(bank$previous, ylab = 'previous')
hist(bank$pdays, ylab = 'pdays')
bank2 = outlierKD2(bank, age, rm=T, histogram = F)
qqnorm(bank2$age, main = "Age: Normal Q-Q Plot") + theme(plot.title = element_text(hjust = 0.5, face = "bold"))
qqline(bank2$age, col= "steelblue", lwd = 2)
qqnorm(bank2$balance)
qqline(bank2$balance, col= "steelblue", lwd = 2)
qqnorm(bank2$day)
qqline(bank2$day, col= "steelblue", lwd = 2)
qqnorm(bank2$duration)
qqline(bank2$duration, col= "steelblue", lwd = 2)
qqnorm(bank2$campaign)
qqline(bank2$campaign, col= "steelblue", lwd = 2)
qqnorm(bank2$previous)
qqline(bank2$previous, col= "steelblue", lwd = 2)
qqnorm(bank2$pdays)
qqline(bank2$pdays, col= "steelblue", lwd = 2)
# lets check yes vs no: subscribed a deposity yes/no
ggplot(bank2, aes( x = y, fill = y)) + geom_bar(colour = "black") + labs(x = "Response", y = "Count", fill = "Response") +
ggtitle("Subcribed a Deposit: YES/NO") + theme(plot.title = element_text(hjust = 0.5, face = "bold"))
prop.table(table(bank2$y))
# way more no than yes - not balanced
# will have to correct this
### JOBS vs SUBSCRIBED DEPOSIT
ggplot(bank2, aes(x = job, fill = y)) + geom_bar(colour = "black") + labs( x = "", y = "Count", fill = "Response") +
ggtitle("Distribution of Job Type") + theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 7/15, axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))
### MARTIAL STATUS vs SUBSCRIBED DEPOSIT
ggplot(bank2, aes(x = marital, fill = y)) + geom_bar(colour = "black") + labs( x = "Marital Status", y = "Count", fill = "Response") +
ggtitle("Distribution of Marital Status") + theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 7/15)
### EDUCATION vs SUBSCRIBED DEPOSIT
ggplot(bank2, aes(x = education, fill = y)) + geom_bar(colour = "black") + labs( x = "Education", y = "Count", fill = "Response") +
ggtitle("Distribution of Education Level") + theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 7/15)
### DEFAULT STATUS
ggplot(bank2, aes(x = default, fill = y)) + geom_bar(colour = "black") + labs( x = "Default Status", y = "Count", fill = "Response") +
ggtitle("Distribution of Default Status") + theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 7/15)
### DISTRIBUTION OF AGE
ggplot(bank2, aes(x = age, fill = y)) + geom_histogram(colour = "black", bins = 30) + labs( x = "Distribution of Age", y = "Count", fill = "Response") +
ggtitle("Distribution of Marital Status") + theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 7/15)
ggplot(bank, aes(x = balance, fill = y)) + geom_histogram(colour = "black") + labs( x = "Balance", y = "Count", fill = "Response") +
ggtitle("Distribution of Balance") + theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 7/15)
library(dominanceanalysis)
library(caTools)
set.seed(123)
sample <- sample.split(bank2$y, SplitRatio = 0.7)
train <- subset(bank2, sample == TRUE)
test <- subset(bank2, sample == FALSE)
mybanklog <- glm(y ~ age+marital+education+default+balance+housing+loan+contact+duration+campaign+pdays+previous+poutcome, data = train, family = "binomial")
summary(mybanklog)
mybanklog2 <- glm(y ~ balance+housing+loan+duration+campaign, data = train, family = "binomial")
summary(mybanklog2)
library(pscl)
pR2(mybanklog2)
#install.packages("ROSE")
library(ROSE)
bank_balanced_over <- ovun.sample(y ~., data = train, method = "over", N = 5600)$data
table(bank_balanced_over$y)
# using balanced dataset
mybanklog2b <- glm(y ~ balance+housing+loan+duration+campaign, data = bank_balanced_over, family = "binomial")
summary(mybanklog2b)
library(ggthemes)
# prediction
train$prediction <- predict(mybanklog2b, newdata = train, type = "response" )
test$prediction <- predict(mybanklog2b, newdata = test , type = "response" )
# distribution of the prediction score grouped by known outcome
ggplot( train, aes( prediction, color = as.factor(y) ) ) +
geom_density( size = 1 ) +
ggtitle( "Training Set's Predicted Score" ) +
scale_color_economist( name = "data", labels = c( "negative", "positive" ) ) +
theme_economist()
library(pROC)
test$banklogpred <- predict(mybanklog2, newdata = test, type = "response")
library(data.table)
cutoff <- 0.5
predict <- test$banklogpred
actual <- relevel( as.factor( test$y ), "1" )
result <- data.table( actual = actual, predict = predict )
result[ , type := ifelse( predict >= cutoff & actual == 1, "TP",
ifelse( predict >= cutoff & actual == 0, "FP",
ifelse( predict < cutoff & actual == 1, "FN", "TN" ) ) ) %>% as.factor() ]
plot <- ggplot( result, aes( actual, predict, color = type ) ) +
geom_violin( fill = "white", color = NA ) +
geom_jitter( shape = 1 ) +
geom_hline( yintercept = cutoff, color = "blue", alpha = 0.6 ) +
scale_y_continuous( limits = c( 0, 1 ) ) +
scale_color_discrete( breaks = c( "TP", "FN", "FP", "TN" ) ) + # ordering of the legend
guides( col = guide_legend( nrow = 2 ) ) + # adjust the legend to have two rows
ggtitle( sprintf( "Confusion Matrix with Cutoff at %.3f", cutoff ) )
plot
table(result$type)
library(ROCR)
cost.fp <- 100
cost.fn <- 200
# At cutoff = 0.5, cost = 200* 127 + 100 * 33 = 31800
pred <- prediction(test$banklogpred, test$y)
perf <- performance(pred,"tpr","fpr")
roc_dt <- data.frame( fpr = perf@x.values[[1]], tpr = perf@y.values[[1]] )
cost <- perf@x.values[[1]] * cost.fp * sum( test$y == 0 ) +
( 1 - perf@y.values[[1]] ) * cost.fn * sum( test$y == 1 )
cost_dt <- data.frame( cutoff = pred@cutoffs[[1]], cost = cost )
best_index <- which.min(cost)
best_cost <- cost_dt[ best_index, "cost" ]
best_tpr <- roc_dt[ best_index, "tpr" ]
best_fpr <- roc_dt[ best_index, "fpr" ]
best_cutoff <- pred@cutoffs[[1]][ best_index ]
best_cost
best_cutoff
normalize <- function(v) ( v - min(v) ) / diff( range(v) )
col_ramp <- colorRampPalette( c( "green", "orange", "red", "black" ) )(100)
col_by_cost <- col_ramp[ ceiling( normalize(cost) * 99 ) + 1 ]
roc_plot <- ggplot( roc_dt, aes( fpr, tpr ) ) +
geom_line( color = rgb( 0, 0, 1, alpha = 0.3 ) ) +
geom_point( color = col_by_cost, size = 4, alpha = 0.2 ) +
geom_segment( aes( x = 0, y = 0, xend = 1, yend = 1 ), alpha = 0.8, color = "royalblue" ) +
labs( title = "ROC", x = "False Postive Rate", y = "True Positive Rate" ) +
geom_hline( yintercept = best_tpr, alpha = 0.8, linetype = "dashed", color = "steelblue4" ) +
geom_vline( xintercept = best_fpr, alpha = 0.8, linetype = "dashed", color = "steelblue4" )
roc_plot
cost_plot <- ggplot( cost_dt, aes( cutoff, cost ) ) +
geom_line( color = "blue", alpha = 0.5 ) +
geom_point( color = col_by_cost, size = 4, alpha = 0.5 ) +
ggtitle( "Cost" ) +
scale_y_continuous( labels = comma ) +
geom_vline( xintercept = best_cutoff, alpha = 0.8, linetype = "dashed", color = "steelblue4" )
cost_plot
cutoff <- 0.199
predict <- test$banklogpred
actual <- relevel( as.factor( test$y ), "1" )
result <- data.table( actual = actual, predict = predict )
result[ , type := ifelse( predict >= cutoff & actual == 1, "TP",
ifelse( predict >= cutoff & actual == 0, "FP",
ifelse( predict < cutoff & actual == 1, "FN", "TN" ) ) ) %>% as.factor() ]
plot <- ggplot( result, aes( actual, predict, color = type ) ) +
geom_violin( fill = "white", color = NA ) +
geom_jitter( shape = 1 ) +
geom_hline( yintercept = cutoff, color = "blue", alpha = 0.6 ) +
scale_y_continuous( limits = c( 0, 1 ) ) +
scale_color_discrete( breaks = c( "TP", "FN", "FP", "TN" ) ) + # ordering of the legend
guides( col = guide_legend( nrow = 2 ) ) + # adjust the legend to have two rows
ggtitle( sprintf( "Confusion Matrix with Cutoff at %.3f", cutoff ) )
plot
table(result$type)
cutoff <- 0.5
predict <- test$prediction
actual <- relevel( as.factor( test$y ), "1" )
result <- data.table( actual = actual, predict = predict )
result[ , type := ifelse( predict >= cutoff & actual == 1, "TP",
ifelse( predict >= cutoff & actual == 0, "FP",
ifelse( predict < cutoff & actual == 1, "FN", "TN" ) ) ) %>% as.factor() ]
plot <- ggplot( result, aes( actual, predict, color = type ) ) +
geom_violin( fill = "white", color = NA ) +
geom_jitter( shape = 1 ) +
geom_hline( yintercept = cutoff, color = "blue", alpha = 0.6 ) +
scale_y_continuous( limits = c( 0, 1 ) ) +
scale_color_discrete( breaks = c( "TP", "FN", "FP", "TN" ) ) + # ordering of the legend
guides( col = guide_legend( nrow = 2 ) ) + # adjust the legend to have two rows
ggtitle( sprintf( "Confusion Matrix with Cutoff at %.3f", cutoff ) )
plot
table(result$type)
library(ROCR)
cost.fp <- 100
cost.fn <- 200
# At cutoff = 0.5, cost = 200* 41 + 100 * 236 = 31800
pred <- prediction(test$prediction, test$y)
perf <- performance(pred,"tpr","fpr")
roc_dt <- data.frame( fpr = perf@x.values[[1]], tpr = perf@y.values[[1]] )
cost <- perf@x.values[[1]] * cost.fp * sum( test$y == 0 ) +
( 1 - perf@y.values[[1]] ) * cost.fn * sum( test$y == 1 )
cost_dt <- data.frame( cutoff = pred@cutoffs[[1]], cost = cost )
best_index <- which.min(cost)
best_cost <- cost_dt[ best_index, "cost" ]
best_tpr <- roc_dt[ best_index, "tpr" ]
best_fpr <- roc_dt[ best_index, "fpr" ]
best_cutoff <- pred@cutoffs[[1]][ best_index ]
best_cost
best_cutoff
normalize <- function(v) ( v - min(v) ) / diff( range(v) )
col_ramp <- colorRampPalette( c( "green", "orange", "red", "black" ) )(100)
col_by_cost <- col_ramp[ ceiling( normalize(cost) * 99 ) + 1 ]
roc_plot <- ggplot( roc_dt, aes( fpr, tpr ) ) +
geom_line( color = rgb( 0, 0, 1, alpha = 0.3 ) ) +
geom_point( color = col_by_cost, size = 4, alpha = 0.2 ) +
geom_segment( aes( x = 0, y = 0, xend = 1, yend = 1 ), alpha = 0.8, color = "royalblue" ) +
labs( title = "ROC", x = "False Postive Rate", y = "True Positive Rate" ) +
geom_hline( yintercept = best_tpr, alpha = 0.8, linetype = "dashed", color = "steelblue4" ) +
geom_vline( xintercept = best_fpr, alpha = 0.8, linetype = "dashed", color = "steelblue4" )
roc_plot
cost_plot <- ggplot( cost_dt, aes( cutoff, cost ) ) +
geom_line( color = "blue", alpha = 0.5 ) +
geom_point( color = col_by_cost, size = 4, alpha = 0.5 ) +
ggtitle( "Cost" ) +
scale_y_continuous( labels = comma ) +
geom_vline( xintercept = best_cutoff, alpha = 0.8, linetype = "dashed", color = "steelblue4" )
cost_plot
cutoff <- 0.703
predict <- test$prediction
actual <- relevel( as.factor( test$y ), "1" )
result <- data.table( actual = actual, predict = predict )
result[ , type := ifelse( predict >= cutoff & actual == 1, "TP",
ifelse( predict >= cutoff & actual == 0, "FP",
ifelse( predict < cutoff & actual == 1, "FN", "TN" ) ) ) %>% as.factor() ]
plot <- ggplot( result, aes( actual, predict, color = type ) ) +
geom_violin( fill = "white", color = NA ) +
geom_jitter( shape = 1 ) +
geom_hline( yintercept = cutoff, color = "blue", alpha = 0.6 ) +
scale_y_continuous( limits = c( 0, 1 ) ) +
scale_color_discrete( breaks = c( "TP", "FN", "FP", "TN" ) ) + # ordering of the legend
guides( col = guide_legend( nrow = 2 ) ) + # adjust the legend to have two rows
ggtitle( sprintf( "Confusion Matrix with Cutoff at %.3f", cutoff ) )
plot
table(result$type)
expcoeff = exp(coef(mybanklog2b))
# expcoeff
xkabledply( as.table(expcoeff), title = "Exponential of coefficients" )
Look at the McFadden value.
library(pscl)
pR2(mybanklog2b)
With the McFadden value of 0.284, about 28.4% of the variations in y is explained by the explanatory variables in the model.
Interpret the table of null deviance.
anova(mybanklog2b, test="Chisq")
In our case, we can see that adding duration alone reduces the deviance drastically (i.e., from 7507 to 5665). The small deviance value of slope indicates this variable does not add much to the model, meaning that almost the same amount of variation is explained when this variable is added.
library(dominanceanalysis)
dabank <- dominanceAnalysis(mybanklog2b)
dominanceMatrix(dabank, type="complete",fit.functions = "r2.m", ordered=TRUE)
This complete dominance matrix summarizes the relation between each pair of predictors. If the value between two predictors is 1, the predictor under the first column completely dominates the other predictor of the pair. If the value is 0, the predictor under the first column is completely dominated by the other predictor of the pair. Lastly, if the value is 0.5, complete dominance could not be established between the pair.
averageContribution(dabank, fit.functions = "r2.m")
plot(dabank, which.graph ="general",fit.function = "r2.m")
To determine general dominance, we compute the mean of each predictor’s conditional measures. We conclude that duration has the highest value (0.254) and generally dominates all other predictors.
Useful functions when working with logistic regression https://github.com/ethen8181/machine-learning/blob/master/unbalanced/unbalanced_code/unbalanced_functions.R
Exploring predictors’ importance in binomial logistic regressions https://cran.r-project.org/web/packages/dominanceanalysis/vignettes/da-logistic-regression.html